%run set_theme.ipynb
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode()
df = pd.read_parquet('../data/SO_2014_2022.pq')
df = df[(df['Salary'] > 0) & (df['Salary'] < 250000)]
df.head()
| Year | Salary | JobSat | YearsCode | YearsCodePro | Age | Education | OrgSize | LastNewJob | Employment | RespondentType | JobSeek | Gender | Student | Country | CodingActivities | DevType | LearnCodeFrom | LangPresent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 69318.0 | <NA> | 10 | 5 | 25-34 | master | 500 to 999 employees | <NA> | fulltime | dev | <NA> | male | no | Germany | School or academic work | Data scientist or machine learning specialist;... | Books / Physical media;School (i.e., Universit... | C;C++;Java;JavaScript;MATLAB;Python;Scala;SQL;... |
| 6 | 2022 | 27652.0 | <NA> | 18 | 10 | 25-34 | bachelor | 1,000 to 4,999 employees | <NA> | fulltime | dev | <NA> | male | no | Colombia | Hobby | Developer, full-stack;Developer, back-end | Books / Physical media;Other online resources ... | Bash/Shell/PowerShell;Elixir;HTML/CSS;JavaScri... |
| 9 | 2022 | 15431.0 | <NA> | 5 | 5 | 25-34 | bachelor | 20 to 99 employees | <NA> | fulltime | dev | <NA> | male | no | Ghana | Freelance/contract work | Developer, back-end | On the job training;Coding Bootcamp | JavaScript;Ruby |
| 13 | 2022 | 47352.0 | <NA> | 7 | 7 | 45-54 | master | 10 to 19 employees | <NA> | fulltime | non-dev | <NA> | male | no | Belgium | Hobby | Developer, back-end;Educator or academic;Datab... | Books / Physical media;On the job training;Col... | Delphi;SQL |
| 22 | 2022 | 78084.0 | <NA> | 25 | 25 | 45-54 | bachelor | 500 to 999 employees | <NA> | fulltime | non-dev | <NA> | male | no | Canada | Hobby;Contribute to open-source projects | Engineer, site reliability;Security professional | Books / Physical media;Other online resources ... | Bash/Shell/PowerShell;C;JavaScript;Perl;PHP;Py... |
# Show the top-n countries in the plot.
TOP_N_COUNTRIES = 10
# Take the top-n countries that have the most developers.
countries = df[['Country', 'RespondentType']] \
.query('RespondentType == "dev"') \
.query('Country != "Brazil"') \
.query('Country != "India"') \
.query('Country != "Russia"') \
.query('Country != "Sweden"') \
.groupby('Country', as_index=False) \
.count() \
.sort_values('RespondentType', ascending=False) \
.head(TOP_N_COUNTRIES) \
['Country'] \
.tolist()
countries_df = df[df['Country'].isin(countries)]
# Calculate the salary gap in percentage.
salary_men_df = countries_df.query('Gender == "male"') \
.groupby(['Year', 'Country']) \
.agg({'Salary': 'mean' }) \
.reset_index() \
.rename(columns={'Salary': 'SalaryMen'})
salary_women_df = countries_df.query('Gender == "female"') \
.groupby(['Year', 'Country']) \
.agg({'Salary': 'mean' }) \
.reset_index() \
.rename(columns={'Salary': 'SalaryWomen'})
salary_gap_df = pd.merge(salary_men_df, salary_women_df, on=['Year', 'Country']).dropna()
salary_gap_df['SalaryGap'] = abs(salary_gap_df['SalaryMen'] - salary_gap_df['SalaryWomen'])
salary_gap_df['SalaryGapPercent'] = (salary_gap_df['SalaryMen'] - salary_gap_df['SalaryWomen']) / salary_gap_df['SalaryMen'] * 100
salary_gap_df.head()
| Year | Country | SalaryMen | SalaryWomen | SalaryGap | SalaryGapPercent | |
|---|---|---|---|---|---|---|
| 0 | 2014 | Australia | 81355.140187 | 70000.0 | 11355.140187 | 13.957496 |
| 1 | 2014 | Canada | 71124.260355 | 57692.307692 | 13431.952663 | 18.885191 |
| 2 | 2014 | France | 54724.409449 | 47000.0 | 7724.409449 | 14.115108 |
| 3 | 2014 | Germany | 61491.596639 | 50714.285714 | 10777.310924 | 17.526478 |
| 4 | 2014 | Italy | 33923.076923 | 25000.0 | 8923.076923 | 26.303855 |
# Calculate the mean salary gap for all countries for all genders.
mean_salary_men_df = countries_df.query('Gender == "male"') \
.groupby(['Year', 'Country']) \
.agg({'Salary': 'mean' }) \
.reset_index() \
.rename(columns={'Salary': 'SalaryMen'})
mean_salary_women_df = countries_df.query('Gender == "female"') \
.groupby(['Year', 'Country']) \
.agg({'Salary': 'mean' }) \
.reset_index() \
.rename(columns={'Salary': 'SalaryWomen'})
mean_salary_gap_df = pd.merge(mean_salary_men_df, mean_salary_women_df, on=['Year', 'Country']).dropna()
mean_salary_gap_df['SalaryGap'] = abs(mean_salary_gap_df['SalaryMen'] - mean_salary_gap_df['SalaryWomen'])
mean_salary_gap_df['SalaryGapPercent'] = (mean_salary_gap_df['SalaryMen'] - mean_salary_gap_df['SalaryWomen']) / mean_salary_gap_df['SalaryMen'] * 100
mean_salary_gap = mean_salary_gap_df.groupby('Year', as_index=False)['SalaryGapPercent'].mean()
mean_salary_gap
| Year | SalaryGapPercent | |
|---|---|---|
| 0 | 2014 | 25.065 |
| 1 | 2015 | 23.649814 |
| 2 | 2016 | 18.958623 |
| 3 | 2017 | 14.428017 |
| 4 | 2018 | 14.927359 |
| 5 | 2019 | 16.585776 |
| 6 | 2020 | 18.866154 |
| 7 | 2021 | 21.314563 |
| 8 | 2022 | 16.113038 |
color_alpha = 0.7
fig = px.line(
salary_gap_df,
x='Year',
y='SalaryGapPercent',
color='Country',
hover_name='Country',
title='Top-10 Country Salary Gap Between Men and Women<br><sup>Generally decreasing, but still male-favoured</sup>',
custom_data=['Country'],
color_discrete_sequence=[
f'rgba(233, 146, 119, {color_alpha})',
f'rgba(148, 233, 119, {color_alpha})',
f'rgba(119, 127, 233, {color_alpha})',
f'rgba(218, 119, 233, {color_alpha})',
f'rgba(218, 191, 162, {color_alpha})',
f'rgba(207, 150, 150, {color_alpha})',
f'rgba(167, 199, 213, {color_alpha})',
f'rgba(233, 119, 186, {color_alpha})',
f'rgba(154, 162, 221, {color_alpha})',
f'rgba(154, 221, 209, {color_alpha})',
],
range_y=[0, 48],
width=790,
)
fig.update_layout(yaxis_title='Salary gap in percentage')
# Add male/female ratio in the legend names
total_male_participants_per_country_df = countries_df.query('Gender == "male"') \
.groupby('Country', as_index=False) \
.size() \
.rename(columns={'size': 'TotalMale'})
total_female_participants_per_country_df = countries_df.query('Gender == "female"') \
.groupby('Country', as_index=False) \
.size() \
.rename(columns={'size': 'TotalFemale'})
total_participants_per_country_df = pd.merge(total_male_participants_per_country_df, total_female_participants_per_country_df)
total_participants_per_country_df['MaleRatioPerFemale'] = total_participants_per_country_df['TotalMale'] / total_participants_per_country_df['TotalFemale']
# Add female/male ratio for each legend name.
# Example: {'oldname': 'newname'}
legend_names = {}
for _, row in total_participants_per_country_df.iterrows():
legend_names[row['Country']] = f"{row['Country']} (1/{int(row['MaleRatioPerFemale'])})"
fig.for_each_trace(lambda t: t.update(name=legend_names[t.name],
legendgroup=legend_names[t.name],
hovertemplate='<b>' + t.name + '</b><br>Year: %{x:d} </br>Salary gap: %{y:d}%<extra></extra>'))
# Plot the mean salary scatters
mean_salary_line_fig = px.line(mean_salary_gap,
x='Year',
y='SalaryGapPercent',
markers=True)
for trace in mean_salary_line_fig.data:
trace.hovertemplate = '<b>' + trace.name + '</b><br>Year: %{x:d} </br>Mean salary gap: %{y:d}%<extra></extra>'
fig.add_traces(list(mean_salary_line_fig.select_traces()))
fig.add_annotation(x=2019.7, y=17.75,
showarrow=False,
xanchor='center',
yanchor='bottom',
textangle=-16,
text='mean salary gap')
# Add caption
fig.add_annotation(x=-0.11, y=-0.35,
xref='paper', yref='paper',
showarrow=False,
xanchor='left', yanchor='bottom',
align='left',
text='Salary gaps across ten of the countries with the most survey respondents.<br>Positive percentages indicates a male-favoured salary gap.')
fig.update_layout(
legend={
'title': 'Country with<br>female/male ratio<br>'
},
margin={'b': 130, 't': 100, 'l': 90}
)
fig.show()